import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from statsforecast import StatsForecast
from statsforecast.models import ARIMA, Naive, SeasonalNaive
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean
from mlforecast.lgb_cv import LightGBMCV
from mlforecast.target_transforms import Differences, LocalStandardScaler
from utilsforecast.losses import mape, rmse
from utilsforecast.plotting import plot_series
from pythonProject.utils import read_and_preprocess_data, make_res_df, wmape, plot_cv, plot_test_predictions
import warnings
warnings.filterwarnings("ignore")
/home/mariia/anaconda3/envs/pets/lib/python3.8/site-packages/statsforecast/core.py:27: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from tqdm.autonotebook import tqdm
While working on this test, I came across a great Decathlon technical blog: Time-Series Forecast at Scale: Data, Modeling, and Monitoring Approaches. It provided me with valuable context and answered many of my questions :)
For the solution, we will use Nixtla time series packages, specifically mlforecast and statsforecast. Some helper functions are placed in utils.py to keep the notebook clean and uncluttered.
*The predictions for the test period are at data-ds-test/data/test.parquet.
**Some helper functions are at pythonProject.utils
# Load the raw training data and add the three columns the Nixtla
# libraries expect: 'ds' (timestamp), 'y' (target) and 'unique_id'
# (one series per business-unit/department pair, e.g. '100_117').
df_train_raw = pd.read_parquet('data-ds-test/data/train.parquet')
df_train_raw['ds'] = pd.to_datetime(df_train_raw['day_id'])
df_train_raw['y'] = df_train_raw['turnover']
df_train_raw['unique_id'] = (
    df_train_raw['but_num_business_unit'].astype(str)
    + '_'
    + df_train_raw['dpt_num_department'].astype(str)
)
Let's start by plotting some series to get an idea of what we are working with.
# Visual sanity check: 4 randomly chosen series from each of the four
# departments (73, 88, 117, 127). A fixed seed keeps the random choice
# reproducible across notebook runs.
plot_series(df_train_raw[df_train_raw.dpt_num_department == 73], max_ids=4, plot_random=True, engine='plotly', seed=1)
plot_series(df_train_raw[df_train_raw.dpt_num_department == 88], max_ids=4, plot_random=True, engine='plotly', seed=1)
plot_series(df_train_raw[df_train_raw.dpt_num_department == 117], max_ids=4, plot_random=True, engine='plotly', seed=1)
plot_series(df_train_raw[df_train_raw.dpt_num_department == 127], max_ids=4, plot_random=True, engine='plotly', seed=1)
Observations
We will check for unrealistically large or negative values of turnover to ensure data quality.
# Fraction of rows with negative turnover — quantify the data-quality
# issue before deciding how to clean it.
(df_train_raw.y<0).mean()
0.018180246940252557
# Ten largest turnover values — look for implausible outliers at the top.
df_train_raw.y.nlargest(10)
231272 1000000.000000 79883 18006.051210 79043 17842.377939 81084 17810.939184 78130 17460.877694 77980 17222.942727 78944 16546.662169 76072 16527.720611 80750 16342.892423 78954 16131.394695 Name: y, dtype: float64
About 1.8% of turnover values are negative, which is implausible for weekly turnover. In addition, there is one unrealistically large value (1,000,000) that dwarfs the rest of the distribution. We'll use the following code to clean the data:
# Vectorized cleaning — same rules as the per-row .apply() version, but a
# single pass over the column instead of a Python-level call per element:
#   1) values more than 10 standard deviations above the mean are treated
#      as data errors and replaced with the mean;
#   2) anything not strictly positive is clamped to 0.
# mask/where replicate the lambda semantics exactly (including NaN handling:
# NaN fails the '> 0' test and becomes 0, just as the lambda returned 0).
y_mean = df.y.mean()
y_std = df.y.std()
df.y = df.y.mask(df.y > y_mean + 10 * y_std, y_mean)
df.y = df.y.where(df.y > 0, 0)
All the data cleaning strategies are implemented in the read_and_preprocess_data function from the utils file.
Next, we will create a pivot table to facilitate the inspection of individual series:
# Wide view of the data: one column per (business unit, department) pair,
# indexed by week — convenient for inspecting individual series.
train_pivot = df_train_raw.sort_values('unique_id').pivot(
    index='day_id',
    columns='unique_id',
    values=['turnover'],
)
# Flatten the ('turnover', '<bu>_<dep>') MultiIndex into 'turnover_<bu>_<dep>'.
train_pivot.columns = train_pivot.columns.to_series().str.join('_')
train_pivot.head()
| turnover_100_117 | turnover_100_127 | turnover_100_73 | turnover_100_88 | turnover_1015_117 | turnover_1015_127 | turnover_1015_73 | turnover_1015_88 | turnover_1067_117 | turnover_1067_127 | ... | turnover_98_73 | turnover_98_88 | turnover_99_117 | turnover_99_127 | turnover_99_73 | turnover_99_88 | turnover_9_117 | turnover_9_127 | turnover_9_73 | turnover_9_88 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| day_id | |||||||||||||||||||||
| 2012-12-29 | 3736.141601 | 2082.464788 | 334.798246 | 744.595088 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.0 | 208.117919 | 2408.956300 | 869.936420 | 19.686920 | 248.917440 | 1852.874152 | 752.223085 | -0.220862 | 305.723096 |
| 2013-01-05 | 2707.991128 | 1500.990811 | 190.886858 | 518.276689 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.0 | 118.757732 | 1778.503351 | 542.189545 | 20.698755 | 293.975868 | 1190.472999 | 527.444534 | -0.951773 | 153.688271 |
| 2013-01-12 | 6069.299982 | 3255.249223 | 636.553609 | 1073.229327 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.0 | 251.009347 | 1873.070172 | 593.202804 | 26.254471 | 489.048384 | 1661.444513 | 820.065897 | 0.317787 | 483.090013 |
| 2013-01-19 | 5700.357353 | 3635.477702 | 469.807777 | 1015.391017 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.0 | 160.768571 | 1900.132379 | 805.089490 | 36.352543 | 439.037391 | 2311.364254 | 1188.233234 | 0.000000 | 443.040875 |
| 2013-01-26 | 4663.746192 | 2721.591210 | 441.191035 | 841.873151 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.0 | 175.388692 | 1773.939554 | 716.246522 | 21.723301 | 437.726519 | 2079.897553 | 1134.959697 | -0.792879 | 433.908305 |
5 rows × 1270 columns
# Series with the most missing weeks — mostly recently opened shops
# whose history simply starts later than the panel.
train_pivot.isnull().sum().sort_values()[-10:]
turnover_1887_88 232 turnover_1881_88 233 turnover_1869_117 233 turnover_1869_73 233 turnover_1876_73 235 turnover_1876_127 235 turnover_1876_117 235 turnover_1876_88 235 turnover_1882_88 240 turnover_1881_117 241 dtype: int64
Some series have many NaN values, especially those corresponding to recently opened shops. When evaluating the model, we prefer to avoid having NaNs, at least in the evaluation period. Therefore, we will create a cleaned version of the dataset:
# Keep only series with no NaNs in the evaluation period (after 2016-10-01).
# NOTE: the pivot columns are named 'turnover_<bu>_<dep>' (see the column
# flattening above), so the 'turnover_' prefix must be stripped — stripping
# 'y_' leaves the prefix in place and would match no unique_id at all.
good_ids = {
    c.replace('turnover_', '', 1)
    for c in train_pivot[train_pivot.index > '2016-10-01'].dropna(axis=1).columns
}
# Vectorized membership test instead of a per-row .apply() scan of a list.
df_train = df_train_full[df_train_full['unique_id'].isin(good_ids)]
However, for the final model, we will use LightGBM, which does not require any special treatment for NaNs. So, when making the final prediction, we will leave the NaNs in place.
import statsmodels.tsa.api as smt
def tsplot(y, lags=None, figsize=(15, 15), style='bmh'):
    """Plot a series together with its ACF and PACF on a single figure.

    Args:
        y: the series to analyze; anything not already a pd.Series is
            converted to one.
        lags: number of lags passed to the ACF/PACF plots.
        figsize: overall figure size.
        style: matplotlib style context used for the figure.
    """
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    with plt.style.context(style):
        plt.figure(figsize=figsize)
        grid = (3, 2)
        # Top row spans both columns for the raw series; ACF/PACF share row 2.
        ts_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(grid, (1, 0))
        pacf_ax = plt.subplot2grid(grid, (1, 1))
        y.plot(ax=ts_ax)
        ts_ax.set_title('Time Series Analysis Plots')
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.7)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.7)
        plt.tight_layout()
        plt.show()
# ACF/PACF of one raw series (first 60 lags).
tsplot(train_pivot['turnover_117_73'], 60)
# Same series after first-order differencing (manual diff of the values,
# re-indexed to drop the first observation).
tsplot(pd.Series(train_pivot['turnover_117_73'][1:].values - train_pivot['turnover_117_73'][:-1].values,
index=train_pivot.index[1:]), 60)
As can be seen, our time series are non-stationary. First-order differencing helps, so the ARIMA models should include a differencing term (d ≥ 1).
We will evaluate all models using time-based cross-validation with a validation period length of 8 weeks, as our final goal is to forecast the 8-week test period. We will use 4 different train-validation cutoffs.
Baseline Models:
# Load the cleaned datasets. df_train/df_test exclude series with NaNs in
# the evaluation period; df_train_full keeps them and is used only for the
# final prediction (LightGBM handles NaNs natively, see below).
df_train, df_test, df_train_full = read_and_preprocess_data()

# Shared cross-validation settings: 8-week forecast horizon, 4 rolling
# cutoffs — mirrors the 8-week test period we ultimately need to predict.
cv_params = dict(h=8, n_windows=4)
# Baseline models on weekly data: naive, seasonal naive with a yearly
# (52-week) season, and two ARIMA configurations — both with first-order
# differencing (d=1), which the ACF analysis above suggested is needed.
sf = StatsForecast(
    freq='W',
    models=[
        Naive(),
        SeasonalNaive(season_length=52),
        ARIMA(order=(2, 1, 2), season_length=52, alias='ARIMA212'),
        ARIMA(order=(5, 1, 1), season_length=52, alias='ARIMA511'),
    ],
)
# Cross-validate all baselines in one pass, then assemble the per-model,
# per-department WMAPE results into a single comparison table.
cv_df_baselines = sf.cross_validation(df=df_train, **cv_params)

# The same make_res_df/merge pair was copy-pasted once per model; a loop
# over the model aliases keeps it DRY and easy to extend.
baseline_names = ['Naive', 'SeasonalNaive', 'ARIMA212', 'ARIMA511']
all_res = make_res_df(cv_df_baselines, metric=wmape, model_name=baseline_names[0])
for name in baseline_names[1:]:
    all_res = all_res.merge(
        make_res_df(cv_df_baselines, metric=wmape, model_name=name),
        on='dep name',
    )
all_res
| dep name | Naive wmape | SeasonalNaive wmape | ARIMA212 wmape | ARIMA511 wmape | |
|---|---|---|---|---|---|
| 0 | global | 0.552005 | 0.647247 | 0.525967 | 0.484144 |
| 1 | 73 | 0.429084 | 0.771686 | 0.398694 | 0.378727 |
| 2 | 88 | 0.218117 | 0.189890 | 0.181112 | 0.182256 |
| 3 | 117 | 0.920356 | 0.843363 | 1.405544 | 9.392929 |
| 4 | 127 | 0.662031 | 0.773375 | 0.637668 | 0.548976 |
LightGBM is known for its strong performance in time-series competitions on Kaggle, so we are hopeful it will outperform our baselines.
We will use lag features to encapsulate information about the past 2 years for each time series. Additionally, we will incorporate month and year features, as well as the geographical features provided in bu_feat.parquet.
We will use the Optuna package to find the best hyperparameters for the LightGBM model.
Note: Since we are using our cross-validation splits for hyperparameter optimization of the LightGBM model, it is not entirely correct to compare its results to the baselines on the same splits. To be completely rigorous, we should have a separate test period used only for comparison purposes.
import optuna
def get_hyperparams(trial):
    """Sample one LightGBM hyper-parameter configuration from an Optuna trial.

    The search space covers the boosting scheme, tree shape (num_leaves,
    max_depth), ensemble size and learning rate, row subsampling, and the
    main regularization knobs. n_jobs is fixed and not searched.
    """
    s_int = trial.suggest_int
    s_float = trial.suggest_float
    return {
        "n_jobs": 16,  # fixed parallelism, not part of the search space
        "boosting_type": trial.suggest_categorical("boosting_type", ["dart", "gbdt"]),
        "num_leaves": s_int("num_leaves", 2, 200, log=True),
        "max_depth": s_int("max_depth", 1, 8),
        "n_estimators": s_int("n_estimators", 10, 1000),
        "learning_rate": s_float("learning_rate", 1e-5, 1, log=True),
        "subsample_freq": s_int("subsample_freq", 1, 100),
        "subsample": s_float("subsample", 0.1, 1.0),
        "reg_alpha": s_float("reg_alpha", 1e-6, 10, log=True),
        "reg_lambda": s_float("reg_lambda", 1e-6, 10, log=True),
        "min_split_gain": s_float("min_split_gain", 1e-6, 10, log=True),
        "min_child_samples": s_int("min_child_samples", 1, 200),
    }
def objective_T(trial):
    """Optuna objective: global WMAPE of a LightGBM forecaster under CV.

    Builds an MLForecast pipeline with two years of weekly lags plus
    month/year date features and the static department/geography columns,
    cross-validates it on df_train with the shared cv_params, and returns
    the global ('global' row, first metric column) WMAPE to minimize.
    """
    model = LGBMRegressor(verbose=-1, **get_hyperparams(trial))
    forecaster = MLForecast(
        models=[model],
        freq='W-SAT',
        lags=range(1, 2 * 52 + 1),  # all weekly lags over the past 2 years
        date_features=['month', 'year'],
    )
    cv_df = forecaster.cross_validation(
        df=df_train,
        static_features=['dpt_num_department', 'but_region_idr_region', 'zod_idr_zone_dgr'],
        **cv_params
    )
    res = make_res_df(cv_df, model_name='LGBMRegressor', metric=wmape)
    return res.iloc[0, 1]
# Create the study with a fixed sampler seed so the search is reproducible,
# and optimize the objective directly — the former func_T wrapper only
# forwarded the trial and added nothing.
study_T = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=0))
study_T.optimize(objective_T, n_trials=50, n_jobs=1)
[I 2024-06-11 21:18:39,308] A new study created in memory with name: no-name-4a718b3c-5c68-4d7b-a0c9-d2413fff0f57
[I 2024-06-11 21:18:50,785] Trial 0 finished with value: 0.2375669640873192 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 29, 'max_depth': 5, 'n_estimators': 429, 'learning_rate': 0.01696174638729099, 'subsample_freq': 44, 'subsample': 0.9025957007038717, 'reg_alpha': 5.567232047731109, 'reg_lambda': 0.00048315962083818554, 'min_split_gain': 0.3483976898071592, 'min_child_samples': 106}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:18:54,446] Trial 1 finished with value: 0.4104687772942863 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 2, 'max_depth': 1, 'n_estimators': 30, 'learning_rate': 0.14557916623227773, 'subsample_freq': 78, 'subsample': 0.8830109334221372, 'reg_alpha': 7.08481306320402, 'reg_lambda': 0.3927443531634939, 'min_split_gain': 0.0016996344911746593, 'min_child_samples': 157}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:19:02,465] Trial 2 finished with value: 1.005427016666839 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 3, 'max_depth': 8, 'n_estimators': 527, 'learning_rate': 0.0011838854959491769, 'subsample_freq': 27, 'subsample': 0.796810320490795, 'reg_alpha': 0.0015597404151970109, 'reg_lambda': 0.009528787503464746, 'min_split_gain': 1.3537192449938794e-06, 'min_child_samples': 124}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:19:12,872] Trial 3 finished with value: 0.9947813366511403 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 152, 'max_depth': 6, 'n_estimators': 366, 'learning_rate': 0.00153165082437487, 'subsample_freq': 70, 'subsample': 0.15420292446634287, 'reg_alpha': 0.04649079878613853, 'reg_lambda': 0.049484032940405646, 'min_split_gain': 2.9694630632544778e-05, 'min_child_samples': 26}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:19:26,521] Trial 4 finished with value: 1.2721142296650914 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 24, 'max_depth': 4, 'n_estimators': 989, 'learning_rate': 3.237606428745273e-05, 'subsample_freq': 21, 'subsample': 0.24517856609649663, 'reg_alpha': 0.03730424702045822, 'reg_lambda': 5.929816002764688e-05, 'min_split_gain': 0.001837280264462447, 'min_child_samples': 49}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:19:32,061] Trial 5 finished with value: 0.9170675262102012 and parameters: {'boosting_type': 'dart', 'num_leaves': 37, 'max_depth': 2, 'n_estimators': 204, 'learning_rate': 0.0006976311326833803, 'subsample_freq': 83, 'subsample': 0.18739114821375513, 'reg_alpha': 0.7338619194791532, 'reg_lambda': 4.706400215419442e-06, 'min_split_gain': 6.842522829851383, 'min_child_samples': 94}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:19:38,376] Trial 6 finished with value: 0.9209863492907047 and parameters: {'boosting_type': 'dart', 'num_leaves': 56, 'max_depth': 1, 'n_estimators': 290, 'learning_rate': 3.9900910508086456e-05, 'subsample_freq': 30, 'subsample': 0.20685494705881965, 'reg_alpha': 0.00016822179228155511, 'reg_lambda': 0.0007940374054212921, 'min_split_gain': 2.8121110986084826e-06, 'min_child_samples': 139}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:19:52,429] Trial 7 finished with value: 0.2904861076107322 and parameters: {'boosting_type': 'dart', 'num_leaves': 19, 'max_depth': 1, 'n_estimators': 580, 'learning_rate': 0.443078817393754, 'subsample_freq': 32, 'subsample': 0.7006693419673136, 'reg_alpha': 8.36729415134254e-06, 'reg_lambda': 0.1033452282604875, 'min_split_gain': 0.00010613109636245979, 'min_child_samples': 37}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:20:13,347] Trial 8 finished with value: 0.9222271225002557 and parameters: {'boosting_type': 'dart', 'num_leaves': 87, 'max_depth': 1, 'n_estimators': 681, 'learning_rate': 0.0002238926650886215, 'subsample_freq': 74, 'subsample': 0.9659696906056944, 'reg_alpha': 5.511527829375221e-05, 'reg_lambda': 0.010791985196195139, 'min_split_gain': 0.013940986893577242, 'min_child_samples': 115}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:20:29,665] Trial 9 finished with value: 1.2527669794458118 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 13, 'max_depth': 7, 'n_estimators': 703, 'learning_rate': 0.00030703278662754413, 'subsample_freq': 82, 'subsample': 0.4568551667622862, 'reg_alpha': 1.4713716359816806, 'reg_lambda': 0.011719527662121372, 'min_split_gain': 1.4864405111461017, 'min_child_samples': 139}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:20:42,127] Trial 10 finished with value: 0.24780321576406267 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 7, 'max_depth': 4, 'n_estimators': 924, 'learning_rate': 0.02698447603303874, 'subsample_freq': 3, 'subsample': 0.5701953408698344, 'reg_alpha': 0.03939048848317799, 'reg_lambda': 5.76042048594392, 'min_split_gain': 0.15903202610220296, 'min_child_samples': 184}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:20:55,319] Trial 11 finished with value: 0.2546271921686938 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 8, 'max_depth': 4, 'n_estimators': 916, 'learning_rate': 0.02136626790078726, 'subsample_freq': 3, 'subsample': 0.5507951750044494, 'reg_alpha': 0.03943451205409234, 'reg_lambda': 2.446425726678683, 'min_split_gain': 0.1644894183952468, 'min_child_samples': 194}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:21:13,880] Trial 12 finished with value: 0.2754832159976832 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 6, 'max_depth': 5, 'n_estimators': 800, 'learning_rate': 0.017977808407806, 'subsample_freq': 52, 'subsample': 0.5912635449416745, 'reg_alpha': 0.26935292412252265, 'reg_lambda': 0.00019703957642192658, 'min_split_gain': 0.12234697111021649, 'min_child_samples': 196}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:21:27,825] Trial 13 finished with value: 0.37093136155873774 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 6, 'max_depth': 3, 'n_estimators': 410, 'learning_rate': 0.011992545503385913, 'subsample_freq': 5, 'subsample': 0.40858329491002127, 'reg_alpha': 9.689342358707874, 'reg_lambda': 1.6180904955146648e-06, 'min_split_gain': 0.17774526603126897, 'min_child_samples': 73}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:21:32,670] Trial 14 finished with value: 0.2621688941162374 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 10, 'max_depth': 5, 'n_estimators': 104, 'learning_rate': 0.08631375256542186, 'subsample_freq': 52, 'subsample': 0.717054772127019, 'reg_alpha': 0.0020236699393156228, 'reg_lambda': 8.910038339659643, 'min_split_gain': 0.02186198011018835, 'min_child_samples': 168}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:21:41,647] Trial 15 finished with value: 0.4093498135021254 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 4, 'max_depth': 3, 'n_estimators': 470, 'learning_rate': 0.005910916184742595, 'subsample_freq': 44, 'subsample': 0.39493895706243476, 'reg_alpha': 0.008931076134889363, 'reg_lambda': 6.0873943911865375e-05, 'min_split_gain': 1.0046590117674399, 'min_child_samples': 84}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:22:01,814] Trial 16 finished with value: 0.24531630429830537 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 21, 'max_depth': 6, 'n_estimators': 842, 'learning_rate': 0.07430664780744822, 'subsample_freq': 16, 'subsample': 0.9872503774578288, 'reg_alpha': 1.5328250062664686e-06, 'reg_lambda': 0.0012100621048881954, 'min_split_gain': 7.424091935626773, 'min_child_samples': 1}. Best is trial 0 with value: 0.2375669640873192.
[I 2024-06-11 21:22:17,168] Trial 17 finished with value: 0.22822162357276213 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 29, 'max_depth': 6, 'n_estimators': 598, 'learning_rate': 0.13859385893011253, 'subsample_freq': 100, 'subsample': 0.9961912042950248, 'reg_alpha': 1.968464530502466e-06, 'reg_lambda': 0.0006138250417841504, 'min_split_gain': 4.684362949737192, 'min_child_samples': 60}. Best is trial 17 with value: 0.22822162357276213.
[I 2024-06-11 21:23:06,115] Trial 18 finished with value: 0.26459138959959244 and parameters: {'boosting_type': 'dart', 'num_leaves': 35, 'max_depth': 7, 'n_estimators': 611, 'learning_rate': 0.6216804592476746, 'subsample_freq': 97, 'subsample': 0.8693424343483102, 'reg_alpha': 8.205841869662816e-05, 'reg_lambda': 1.4974105416779763e-05, 'min_split_gain': 1.1714475956108583, 'min_child_samples': 63}. Best is trial 17 with value: 0.22822162357276213.
[I 2024-06-11 21:23:14,858] Trial 19 finished with value: 0.2564700314502418 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 80, 'max_depth': 6, 'n_estimators': 265, 'learning_rate': 0.167469489416445, 'subsample_freq': 64, 'subsample': 0.8673181192359238, 'reg_alpha': 2.1857070821672388e-06, 'reg_lambda': 0.0006460818601209819, 'min_split_gain': 0.013042573549799664, 'min_child_samples': 105}. Best is trial 17 with value: 0.22822162357276213.
[I 2024-06-11 21:23:34,396] Trial 20 finished with value: 0.5336117875321598 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 37, 'max_depth': 8, 'n_estimators': 453, 'learning_rate': 0.003521602439168646, 'subsample_freq': 96, 'subsample': 0.7665308197821114, 'reg_alpha': 0.000347257798335377, 'reg_lambda': 8.518014160716487e-05, 'min_split_gain': 9.340035977223543, 'min_child_samples': 12}. Best is trial 17 with value: 0.22822162357276213.
[I 2024-06-11 21:23:52,812] Trial 21 finished with value: 0.2406524488354712 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 18, 'max_depth': 6, 'n_estimators': 754, 'learning_rate': 0.06956240188430658, 'subsample_freq': 16, 'subsample': 0.9702235451938721, 'reg_alpha': 1.6631295613833915e-06, 'reg_lambda': 0.0013983884077387898, 'min_split_gain': 3.6396243492670335, 'min_child_samples': 5}. Best is trial 17 with value: 0.22822162357276213.
[I 2024-06-11 21:24:07,826] Trial 22 finished with value: 0.22019228078946243 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 14, 'max_depth': 7, 'n_estimators': 727, 'learning_rate': 0.04621302297344337, 'subsample_freq': 45, 'subsample': 0.9993769153995858, 'reg_alpha': 1.1331564873578928e-05, 'reg_lambda': 0.0030793494858755337, 'min_split_gain': 1.379712900200693, 'min_child_samples': 58}. Best is trial 22 with value: 0.22019228078946243.
[I 2024-06-11 21:24:19,155] Trial 23 finished with value: 0.22912527021962276 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 13, 'max_depth': 7, 'n_estimators': 515, 'learning_rate': 0.040066129087304544, 'subsample_freq': 41, 'subsample': 0.9073414853017843, 'reg_alpha': 1.980493849314478e-05, 'reg_lambda': 0.0035398624972791766, 'min_split_gain': 0.3385293253017387, 'min_child_samples': 54}. Best is trial 22 with value: 0.22019228078946243.
[I 2024-06-11 21:24:31,642] Trial 24 finished with value: 0.3429056267587316 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 11, 'max_depth': 7, 'n_estimators': 651, 'learning_rate': 0.891229038506383, 'subsample_freq': 62, 'subsample': 0.9894362160734298, 'reg_alpha': 1.1888190034363834e-05, 'reg_lambda': 0.0033921396767079364, 'min_split_gain': 1.1252197386451284, 'min_child_samples': 54}. Best is trial 22 with value: 0.22019228078946243.
[I 2024-06-11 21:24:42,547] Trial 25 finished with value: 0.2704955969005073 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 15, 'max_depth': 7, 'n_estimators': 546, 'learning_rate': 0.2690364024706393, 'subsample_freq': 40, 'subsample': 0.8299112642242664, 'reg_alpha': 1.643535789165306e-05, 'reg_lambda': 0.051376290645457214, 'min_split_gain': 0.04990239528093691, 'min_child_samples': 76}. Best is trial 22 with value: 0.22019228078946243.
[I 2024-06-11 21:25:42,772] Trial 26 finished with value: 0.21216617378336472 and parameters: {'boosting_type': 'dart', 'num_leaves': 52, 'max_depth': 8, 'n_estimators': 735, 'learning_rate': 0.041248642625343064, 'subsample_freq': 59, 'subsample': 0.6549241200135694, 'reg_alpha': 7.2309577202926565e-06, 'reg_lambda': 0.0021175229891408024, 'min_split_gain': 0.5921351210693561, 'min_child_samples': 33}. Best is trial 26 with value: 0.21216617378336472.
[I 2024-06-11 21:26:52,485] Trial 27 finished with value: 0.32160739399956184 and parameters: {'boosting_type': 'dart', 'num_leaves': 57, 'max_depth': 8, 'n_estimators': 755, 'learning_rate': 0.007460274561895293, 'subsample_freq': 87, 'subsample': 0.645531978583963, 'reg_alpha': 8.43831990025508e-06, 'reg_lambda': 0.019246845069877606, 'min_split_gain': 0.00043456415066986083, 'min_child_samples': 28}. Best is trial 26 with value: 0.21216617378336472.
[I 2024-06-11 21:28:19,733] Trial 28 finished with value: 0.24484255144786665 and parameters: {'boosting_type': 'dart', 'num_leaves': 53, 'max_depth': 8, 'n_estimators': 853, 'learning_rate': 0.2752197106899135, 'subsample_freq': 60, 'subsample': 0.7671479019947207, 'reg_alpha': 0.00042993331877219816, 'reg_lambda': 0.0002064216978670639, 'min_split_gain': 2.1907692488732717, 'min_child_samples': 41}. Best is trial 26 with value: 0.21216617378336472.
[I 2024-06-11 21:29:18,526] Trial 29 finished with value: 0.2219289566701418 and parameters: {'boosting_type': 'dart', 'num_leaves': 142, 'max_depth': 6, 'n_estimators': 635, 'learning_rate': 0.046373241369157756, 'subsample_freq': 89, 'subsample': 0.2885254416157427, 'reg_alpha': 4.714416480416875e-06, 'reg_lambda': 0.0002839982174823765, 'min_split_gain': 0.4261771897491709, 'min_child_samples': 21}. Best is trial 26 with value: 0.21216617378336472.
[I 2024-06-11 21:31:16,313] Trial 30 finished with value: 0.21690671973945994 and parameters: {'boosting_type': 'dart', 'num_leaves': 165, 'max_depth': 8, 'n_estimators': 708, 'learning_rate': 0.04467548374710433, 'subsample_freq': 90, 'subsample': 0.3011389947637438, 'reg_alpha': 5.499914887360909e-05, 'reg_lambda': 1.4962956262455427e-05, 'min_split_gain': 0.006934320364464691, 'min_child_samples': 21}. Best is trial 26 with value: 0.21216617378336472.
[I 2024-06-11 21:33:16,558] Trial 31 finished with value: 0.2036336873400536 and parameters: {'boosting_type': 'dart', 'num_leaves': 168, 'max_depth': 8, 'n_estimators': 738, 'learning_rate': 0.03497726161377542, 'subsample_freq': 89, 'subsample': 0.31806455482654183, 'reg_alpha': 2.9802256127304282e-05, 'reg_lambda': 9.716858735438152e-06, 'min_split_gain': 0.502816567954805, 'min_child_samples': 19}. Best is trial 31 with value: 0.2036336873400536.
[I 2024-06-11 21:35:33,432] Trial 32 finished with value: 0.19407801076973452 and parameters: {'boosting_type': 'dart', 'num_leaves': 196, 'max_depth': 8, 'n_estimators': 743, 'learning_rate': 0.013793545008518093, 'subsample_freq': 71, 'subsample': 0.34703151554704337, 'reg_alpha': 3.5089468346905795e-05, 'reg_lambda': 1.5495992293533664e-05, 'min_split_gain': 0.0054866110952060156, 'min_child_samples': 17}. Best is trial 32 with value: 0.19407801076973452.
[I 2024-06-11 21:38:10,728] Trial 33 finished with value: 0.25472316891288593 and parameters: {'boosting_type': 'dart', 'num_leaves': 183, 'max_depth': 8, 'n_estimators': 811, 'learning_rate': 0.008829717892530287, 'subsample_freq': 74, 'subsample': 0.32055598890618137, 'reg_alpha': 4.315201856897519e-05, 'reg_lambda': 1.158414179400753e-05, 'min_split_gain': 0.004896205469081608, 'min_child_samples': 18}. Best is trial 32 with value: 0.19407801076973452.
[I 2024-06-11 21:40:28,732] Trial 34 finished with value: 0.5910111921602035 and parameters: {'boosting_type': 'dart', 'num_leaves': 110, 'max_depth': 8, 'n_estimators': 905, 'learning_rate': 0.002547719346556354, 'subsample_freq': 69, 'subsample': 0.47759653797812934, 'reg_alpha': 0.00036983871816402596, 'reg_lambda': 1.3611206726224864e-06, 'min_split_gain': 0.0006123032178838724, 'min_child_samples': 35}. Best is trial 32 with value: 0.19407801076973452.
[I 2024-06-11 21:42:27,566] Trial 35 finished with value: 0.49722464223856944 and parameters: {'boosting_type': 'dart', 'num_leaves': 185, 'max_depth': 8, 'n_estimators': 776, 'learning_rate': 0.003808348768978165, 'subsample_freq': 90, 'subsample': 0.12354433550336724, 'reg_alpha': 3.613239681186159e-05, 'reg_lambda': 1.622250399814003e-05, 'min_split_gain': 0.005911927287373397, 'min_child_samples': 15}. Best is trial 32 with value: 0.19407801076973452.
[I 2024-06-11 21:43:57,165] Trial 36 finished with value: 0.19517242545584504 and parameters: {'boosting_type': 'dart', 'num_leaves': 124, 'max_depth': 8, 'n_estimators': 695, 'learning_rate': 0.015423255527552936, 'subsample_freq': 82, 'subsample': 0.32813505677436894, 'reg_alpha': 0.00014132735613786208, 'reg_lambda': 5.4721886551047804e-06, 'min_split_gain': 0.03480829540972325, 'min_child_samples': 32}. Best is trial 32 with value: 0.19407801076973452.
[I 2024-06-11 21:46:08,274] Trial 37 finished with value: 0.1937075069817935 and parameters: {'boosting_type': 'dart', 'num_leaves': 109, 'max_depth': 7, 'n_estimators': 992, 'learning_rate': 0.013177005735929986, 'subsample_freq': 78, 'subsample': 0.3487674004684075, 'reg_alpha': 0.0011535628740236218, 'reg_lambda': 3.646551450394125e-06, 'min_split_gain': 0.0660627001646925, 'min_child_samples': 41}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 21:48:20,974] Trial 38 finished with value: 0.79877176913575 and parameters: {'boosting_type': 'dart', 'num_leaves': 118, 'max_depth': 7, 'n_estimators': 986, 'learning_rate': 0.0010085206005493322, 'subsample_freq': 79, 'subsample': 0.38812694982292917, 'reg_alpha': 0.005650116804920632, 'reg_lambda': 6.207681848697464e-06, 'min_split_gain': 0.03777125451458298, 'min_child_samples': 45}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 21:50:17,025] Trial 39 finished with value: 0.9302163497824414 and parameters: {'boosting_type': 'dart', 'num_leaves': 95, 'max_depth': 7, 'n_estimators': 989, 'learning_rate': 1.1495930315979298e-05, 'subsample_freq': 69, 'subsample': 0.2452896809935798, 'reg_alpha': 0.001172603144212041, 'reg_lambda': 2.951652525343636e-06, 'min_split_gain': 0.0017261215538158707, 'min_child_samples': 9}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 21:51:42,220] Trial 40 finished with value: 0.19633870836667147 and parameters: {'boosting_type': 'dart', 'num_leaves': 70, 'max_depth': 8, 'n_estimators': 862, 'learning_rate': 0.013794126455383394, 'subsample_freq': 75, 'subsample': 0.3375736187790048, 'reg_alpha': 0.0009995517052144762, 'reg_lambda': 3.4882272015783835e-05, 'min_split_gain': 0.059587896012964395, 'min_child_samples': 44}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 21:53:08,802] Trial 41 finished with value: 0.2023876119014364 and parameters: {'boosting_type': 'dart', 'num_leaves': 71, 'max_depth': 8, 'n_estimators': 882, 'learning_rate': 0.012778831080936207, 'subsample_freq': 78, 'subsample': 0.3465200415275098, 'reg_alpha': 0.00016035165173707328, 'reg_lambda': 4.865080070902081e-06, 'min_split_gain': 0.05956657954778257, 'min_child_samples': 31}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 21:54:38,828] Trial 42 finished with value: 0.19953957861774976 and parameters: {'boosting_type': 'dart', 'num_leaves': 77, 'max_depth': 8, 'n_estimators': 868, 'learning_rate': 0.01419091759181461, 'subsample_freq': 75, 'subsample': 0.48465476408070063, 'reg_alpha': 0.0008458488871888944, 'reg_lambda': 3.25979796428988e-05, 'min_split_gain': 0.06384000488745972, 'min_child_samples': 42}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 21:56:46,575] Trial 43 finished with value: 0.6497339979567147 and parameters: {'boosting_type': 'dart', 'num_leaves': 129, 'max_depth': 7, 'n_estimators': 953, 'learning_rate': 0.0021232672651863307, 'subsample_freq': 74, 'subsample': 0.4847022381295954, 'reg_alpha': 0.0008776075966172611, 'reg_lambda': 4.0206864622707646e-05, 'min_split_gain': 0.0772035303450651, 'min_child_samples': 69}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 21:58:12,381] Trial 44 finished with value: 0.3542883170072623 and parameters: {'boosting_type': 'dart', 'num_leaves': 70, 'max_depth': 8, 'n_estimators': 858, 'learning_rate': 0.005577678650019324, 'subsample_freq': 81, 'subsample': 0.23883843248178988, 'reg_alpha': 0.009502654072650671, 'reg_lambda': 2.9146653784604022e-05, 'min_split_gain': 0.018400520233908497, 'min_child_samples': 41}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 21:59:59,306] Trial 45 finished with value: 0.2195262518214194 and parameters: {'boosting_type': 'dart', 'num_leaves': 104, 'max_depth': 8, 'n_estimators': 804, 'learning_rate': 0.010861619547116149, 'subsample_freq': 85, 'subsample': 0.5118115771721503, 'reg_alpha': 0.00011768535139456489, 'reg_lambda': 9.651297717533395e-05, 'min_split_gain': 0.0007231559859195192, 'min_child_samples': 93}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 22:01:59,753] Trial 46 finished with value: 0.1967567733207667 and parameters: {'boosting_type': 'dart', 'num_leaves': 199, 'max_depth': 7, 'n_estimators': 938, 'learning_rate': 0.019909215809693293, 'subsample_freq': 65, 'subsample': 0.36928053165391966, 'reg_alpha': 0.002888022127230769, 'reg_lambda': 3.4774554854704484e-06, 'min_split_gain': 0.027952426523440432, 'min_child_samples': 47}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 22:03:14,895] Trial 47 finished with value: 0.1962477644240738 and parameters: {'boosting_type': 'dart', 'num_leaves': 132, 'max_depth': 5, 'n_estimators': 948, 'learning_rate': 0.02103220173018322, 'subsample_freq': 66, 'subsample': 0.4301569678831137, 'reg_alpha': 0.00429430117697604, 'reg_lambda': 2.582361535226878e-06, 'min_split_gain': 0.0025903009258607733, 'min_child_samples': 121}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 22:04:36,413] Trial 48 finished with value: 0.3749422025768455 and parameters: {'boosting_type': 'dart', 'num_leaves': 138, 'max_depth': 5, 'n_estimators': 998, 'learning_rate': 0.005039665399722639, 'subsample_freq': 58, 'subsample': 0.4286559752119982, 'reg_alpha': 0.10575419582016256, 'reg_lambda': 1.0316876467059006e-06, 'min_split_gain': 0.0001365315686171329, 'min_child_samples': 127}. Best is trial 37 with value: 0.1937075069817935.
[I 2024-06-11 22:04:47,697] Trial 49 finished with value: 0.9168251644984202 and parameters: {'boosting_type': 'dart', 'num_leaves': 2, 'max_depth': 3, 'n_estimators': 666, 'learning_rate': 0.0005252331632867456, 'subsample_freq': 67, 'subsample': 0.17557784519168965, 'reg_alpha': 0.014907554663939108, 'reg_lambda': 2.590340754889417e-06, 'min_split_gain': 0.002365743871858019, 'min_child_samples': 145}. Best is trial 37 with value: 0.1937075069817935.
study_T.best_params
{'boosting_type': 'dart',
'num_leaves': 109,
'max_depth': 7,
'n_estimators': 992,
'learning_rate': 0.013177005735929986,
'subsample_freq': 78,
'subsample': 0.3487674004684075,
'reg_alpha': 0.0011535628740236218,
'reg_lambda': 3.646551450394125e-06,
'min_split_gain': 0.0660627001646925,
'min_child_samples': 41}
{'boosting_type': 'dart', 'num_leaves': 109, 'max_depth': 7, 'n_estimators': 992, 'learning_rate': 0.013177005735929986, 'subsample_freq': 78, 'subsample': 0.3487674004684075, 'reg_alpha': 0.0011535628740236218, 'reg_lambda': 3.646551450394125e-06, 'min_split_gain': 0.0660627001646925, 'min_child_samples': 41}
# Re-run the same cross-validation protocol as the baselines, now with the
# tuned LightGBM configuration and all available cores (n_jobs=-1).
best_lgbm = LGBMRegressor(**{'n_jobs': -1, **study_T.best_params})
mlf = MLForecast(
    models=[best_lgbm],
    freq='W-SAT',
    lags=range(1, 2 * 52 + 1),  # two years of weekly lags
    date_features=['month', 'year'],
)
crossvalidation_df = mlf.cross_validation(
    df=df_train,
    static_features=['dpt_num_department', 'but_region_idr_region', 'zod_idr_zone_dgr'],
    **cv_params,
)
lgbm_res = make_res_df(crossvalidation_df, model_name='LGBMRegressor', metric=wmape)
As previously mentioned, we evaluate all the models using time-based 4-fold cross-validation, with each validation period being 8 weeks long.
Following the guidelines from "Time-Series Forecast at Scale: Data, Modeling, and Monitoring Approaches," we use the WMAPE (Weighted Mean Absolute Percentage Error) evaluation metric.
\[ \text{WMAPE} = \frac{\sum_{t=1}^{n} |A_t - F_t|}{\sum_{t=1}^{n} |A_t|} \]
As we will see, the accuracy varies significantly across different departments. The ranking of various models may also differ between departments, so it could be beneficial to use different models or even model families for different departments. It would also be interesting to examine the breakdown by geographical region.
Finally, we will visually inspect the obtained forecasts to see if we can spot any apparent problems.
# Append the LightGBM CV scores to the per-department baseline comparison
# table (joined on the department-name column produced by make_res_df).
all_res = all_res.merge(lgbm_res, on='dep name')
all_res
| dep name | Naive wmape | SeasonalNaive wmape | ARIMA212 wmape | ARIMA511 wmape | LGBMRegressor wmape | |
|---|---|---|---|---|---|---|
| 0 | global | 0.552005 | 0.647247 | 0.525967 | 0.484144 | 0.193708 |
| 1 | 73 | 0.429084 | 0.771686 | 0.398694 | 0.378727 | 0.313699 |
| 2 | 88 | 0.218117 | 0.189890 | 0.181112 | 0.182256 | 0.165174 |
| 3 | 117 | 0.920356 | 0.843363 | 1.405544 | 9.392929 | 1.518606 |
| 4 | 127 | 0.662031 | 0.773375 | 0.637668 | 0.548976 | 0.177332 |
# Visually inspect the CV forecasts for a few store/department pairs
# (unique_id format is "<business_unit>_<department>") to spot obvious
# problems that aggregate metrics would hide.
plot_cv(df_train, crossvalidation_df, '10_73')
plot_cv(df_train, crossvalidation_df, '10_88')
plot_cv(df_train, crossvalidation_df, '522_127')
plot_cv(df_train, crossvalidation_df, '10_117')
# Refit on the full training history and forecast the 8-week test horizon.
mlf.fit(df_train_full)
preds = mlf.predict(8)
preds
| unique_id | ds | LGBMRegressor | |
|---|---|---|---|
| 0 | 100_117 | 2017-10-07 | 160.880943 |
| 1 | 100_117 | 2017-10-14 | 453.005339 |
| 2 | 100_117 | 2017-10-21 | 715.843420 |
| 3 | 100_117 | 2017-10-28 | 859.946228 |
| 4 | 100_117 | 2017-11-04 | 847.869578 |
| ... | ... | ... | ... |
| 10155 | 9_88 | 2017-10-28 | 230.623109 |
| 10156 | 9_88 | 2017-11-04 | 222.331294 |
| 10157 | 9_88 | 2017-11-11 | 212.713810 |
| 10158 | 9_88 | 2017-11-18 | 191.348236 |
| 10159 | 9_88 | 2017-11-25 | 201.818013 |
10160 rows × 3 columns
# Inspect dtypes of the test frame's merge keys (unique_id, ds) before joining.
df_test.dtypes
but_num_business_unit int64 dpt_num_department int64 unique_id object ds datetime64[ns] but_postcode int64 but_latitude float64 but_longitude float64 but_region_idr_region int64 zod_idr_zone_dgr int64 dtype: object
# Inspect dtypes of the prediction frame; ds must be datetime64 for the join.
preds.dtypes
unique_id object ds datetime64[ns] LGBMRegressor float64 dtype: object
# Left-join predictions onto the test skeleton; any (unique_id, ds) pair
# without a matching forecast would show up as NaN in the LGBMRegressor column.
df_test_ = df_test.merge(preds, on=['unique_id', 'ds'], how='left')
df_test_
| but_num_business_unit | dpt_num_department | unique_id | ds | but_postcode | but_latitude | but_longitude | but_region_idr_region | zod_idr_zone_dgr | LGBMRegressor | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 73 | 95_73 | 2017-11-25 | 80000 | 49.869382 | 2.280452 | 69 | 4 | 18.764482 |
| 1 | 4 | 117 | 4_117 | 2017-11-25 | 6600 | 43.600994 | 7.078160 | 55 | 10 | 1207.392992 |
| 2 | 113 | 127 | 113_127 | 2017-11-25 | 84014 | 43.919562 | 4.867583 | 115 | 10 | 838.583508 |
| 3 | 93 | 117 | 93_117 | 2017-11-25 | 13008 | 43.239744 | 5.396694 | 71 | 10 | 627.448898 |
| 4 | 66 | 127 | 66_127 | 2017-11-25 | 34500 | 43.347835 | 3.255024 | 6 | 10 | 1535.691493 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10131 | 61 | 88 | 61_88 | 2017-10-07 | 60740 | 49.238738 | 2.468513 | 69 | 4 | 501.292935 |
| 10132 | 641 | 117 | 641_117 | 2017-10-07 | 17810 | 45.749749 | -0.675981 | 70 | 10 | 40.081155 |
| 10133 | 724 | 117 | 724_117 | 2017-10-07 | 38150 | 45.327709 | 4.804922 | 52 | 4 | 64.537820 |
| 10134 | 1302 | 117 | 1302_117 | 2017-10-07 | 74950 | 46.069548 | 6.549448 | 51 | 4 | 130.758620 |
| 10135 | 1126 | 88 | 1126_88 | 2017-10-07 | 60590 | 49.286484 | 1.799250 | 178 | 72 | 55.860540 |
10136 rows × 10 columns
# Reload the provided test file — presumably kept as a format reference;
# it is not otherwise used in this cell (TODO confirm it is needed).
df_test_raw = pd.read_parquet('data-ds-test/data/test.parquet')
# Select the submission columns and rename in one explicit step: `rename`
# returns a fresh frame, avoiding the in-place `.columns` assignment on a
# DataFrame slice.
results = df_test_[['ds', 'but_num_business_unit', 'dpt_num_department', 'LGBMRegressor']].rename(
    columns={'ds': 'day_id', 'LGBMRegressor': 'y'}
)
results
| day_id | but_num_business_unit | dpt_num_department | y | |
|---|---|---|---|---|
| 0 | 2017-11-25 | 95 | 73 | 18.764482 |
| 1 | 2017-11-25 | 4 | 117 | 1207.392992 |
| 2 | 2017-11-25 | 113 | 127 | 838.583508 |
| 3 | 2017-11-25 | 93 | 117 | 627.448898 |
| 4 | 2017-11-25 | 66 | 127 | 1535.691493 |
| ... | ... | ... | ... | ... |
| 10131 | 2017-10-07 | 61 | 88 | 501.292935 |
| 10132 | 2017-10-07 | 641 | 117 | 40.081155 |
| 10133 | 2017-10-07 | 724 | 117 | 64.537820 |
| 10134 | 2017-10-07 | 1302 | 117 | 130.758620 |
| 10135 | 2017-10-07 | 1126 | 88 | 55.860540 |
10136 rows × 4 columns
# Persist the submission-format predictions next to the provided test file.
results.to_parquet('data-ds-test/data/test_predictions.parquet')
# Visual check of the test-period forecasts for a few business units
# across the four departments (unique_id = "<business_unit>_<department>").
plot_test_predictions(df_train_full, df_test_, ['10_73', '10_88', '10_117', '10_127'])
plot_test_predictions(df_train_full, df_test_, ['724_73', '724_88', '724_117', '724_127'])
plot_test_predictions(df_train_full, df_test_, ['117_73', '117_88', '117_117', '117_127'])